# Reading in "control genotype" data
megamuga_controls <- data.table::fread("../data/MMControls/MegaMUGA genotypes CONTROLS.csv")
control_genotypes <- data.table::fread("../data/MMControls/control.genotypes.csv")
## Warning in data.table::fread("../data/MMControls/control.genotypes.csv"):
## Detected 364 column names but the data has 365 columns (i.e. invalid file).
## Added 1 extra default column name for the first column which is guessed to be
## row names or an index. Use setnames() afterwards if this guess is not correct,
## or fix the file write command that created the file to create a valid file.
colnames(control_genotypes)[1] <- "marker"
## Reading in marker annotations
mm_metadata <- data.table::fread("../data/MMControls/mm_uwisc_v2.csv")
## Calculating allele frequencies for each marker
control_allele_freqs <- control_genotypes %>%
tidyr::pivot_longer(-marker, names_to = "sample", values_to = "genotype") %>%
dplyr::group_by(marker, genotype) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(freq = round(n/sum(n), 3),
genotype = as.factor(genotype)) %>%
dplyr::left_join(., mm_metadata)
First, we searched for probes where many mice are missing genotype calls.
## Filtering to markers with missing genotypes
no.calls <- control_allele_freqs %>%
dplyr::filter(genotype == "N") %>%
tidyr::pivot_wider(names_from = genotype, values_from = n) %>%
dplyr::select(marker, chr, bp_grcm39, freq) %>%
dplyr::mutate(chr = as.factor(chr))
cutoff <- quantile(no.calls$freq, probs = seq(0,1,0.05))[[20]]
above.cutoff <- no.calls %>%
dplyr::filter(freq > cutoff)
Of 77808 markers, 54990 failed to genotype at least one sample, and 2750 markers failed to genotype at least 12.765% of samples.
In a similar fashion, we calculated the number of reference samples with missing genotypes. Repeated observations of samples/strains meant that genotype counts for each marker among them couldn’t be grouped and tallied, so determining no-call frequency occurred column-wise.
n.calls.strains <- apply(X = control_genotypes[,2:ncol(control_genotypes)], MARGIN = 2, function(x) table(x)[5])
n.calls.strains.df <- data.frame(n.calls.strains)
n.calls.strains.df$sample <- names(n.calls.strains)
n.calls.strains.df %<>%
dplyr::rename(n.no.calls = n.calls.strains)
sampleQC <- ggplot(n.calls.strains.df,
mapping = aes(x = reorder(sample,n.no.calls),
y = n.no.calls,
text = paste("Sample:", sample))) +
geom_point() +
QCtheme +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
labs(x = "Number of mice with missing genotypes",
y = "Number of markers")
ggplotly(sampleQC, tooltip = c("text","y"))
high.n.samples <- n.calls.strains.df %>%
dplyr::filter(n.no.calls > quantile(n.calls.strains.df$n.no.calls, probs = seq(0,1,0.05))[20])
We next validated the sexes of each sample using Chromosome X probe intensities. We paired up probe intensities, joined available metadata, and filtered down to only markers covering the X chromosome.
## Reading in genotype intensities
x_intensities <- data.table::fread("../data/MMControls/control.X.csv")
## Warning in data.table::fread("../data/MMControls/control.X.csv"): Detected 364
## column names but the data has 365 columns (i.e. invalid file). Added 1 extra
## default column name for the first column which is guessed to be row names or an
## index. Use setnames() afterwards if this guess is not correct, or fix the file
## write command that created the file to create a valid file.
colnames(x_intensities)[1] <- "marker"
y_intensities <- data.table::fread("../data/MMControls/control.Y.csv")
## Warning in data.table::fread("../data/MMControls/control.Y.csv"): Detected 364
## column names but the data has 365 columns (i.e. invalid file). Added 1 extra
## default column name for the first column which is guessed to be row names or an
## index. Use setnames() afterwards if this guess is not correct, or fix the file
## write command that created the file to create a valid file.
colnames(y_intensities)[1] <- "marker"
## Check to see if dimensions of intensity tables are identical, marker orders identical, and sample orders identical
if((unique(dim(x_intensities) == dim(y_intensities)) &&
unique(colnames(x_intensities) == colnames(y_intensities)) &&
unique(x_intensities$marker == y_intensities$marker)) == TRUE){
## Pivoting the data longer
x_int_long <- x_intensities %>%
tidyr::pivot_longer(cols = -marker, names_to = "sample", values_to = "x_int")
y_int_long <- y_intensities %>%
tidyr::pivot_longer(cols = -marker, names_to = "sample", values_to = "y_int")
long_intensities <- cbind(x_int_long, y_int_long)
} else {
print("Source intensity data frames have non-identical structure; exiting")
}
## Joining slimmer intensity files with marker metadata and reducing to Chromosome X markers
long_X_intensities <- long_intensities[,c(1,2,3,6)] %>%
dplyr::left_join(., mm_metadata) %>%
dplyr::filter(chr == "X")
Then we flagged markers with high missingness across all samples and samples with high missigness among all markers.
## Flagging markers and samples based on previous QC steps
flagged_X_intensities <- long_X_intensities %>%
dplyr::mutate(marker_flag = dplyr::if_else(condition = marker %in% above.cutoff$marker,
true = "FLAG",
false = "")) %>%
dplyr::mutate(high_missing_sample = dplyr::if_else(condition = sample %in% high.n.samples$sample,
true = "FLAG",
false = ""))
The first round of inferring predicted sexes used a rough search of the sample name for a proper convention among F1 hybrids, as well as an increasingly large regex search into the sample ID to identify properly named samples from inbred strains.
## First round of predicted sex inference
prelim.predicted.sexes <- flagged_X_intensities %>%
dplyr::mutate(bg = dplyr::case_when(stringr::str_detect(string = sample,
pattern = "F1") == TRUE ~ "F1",
TRUE ~ as.character(sample)),
predicted.sex = dplyr::case_when(stringr::str_detect(string = sample,
pattern = "F1f") == TRUE ~ "f",
stringr::str_detect(string = sample,
pattern = "F1m") == TRUE ~ "m",
TRUE ~ "unknown"))
unknown <- prelim.predicted.sexes %>%
dplyr::filter(predicted.sex == "unknown")
digit.trim <- unknown %>%
dplyr::mutate(mouse.id.1 = stringr::str_sub(sample, -1),
predicted.sex.1 = dplyr::case_when(stringr::str_sub(mouse.id.1, 1, 1) %in% c("m","M") ~ "m",
stringr::str_sub(mouse.id.1, 1, 1) %in% c("f","F") ~ "f",
TRUE ~ "unknown"),
bg = if_else(condition = (predicted.sex.1 == "m" | predicted.sex.1 == "f"),
true = str_replace(string = bg,
pattern = mouse.id.1,
replacement = ""),
false = bg),
mouse.id.3= stringr::str_sub(sample, -3),
predicted.sex.3 = dplyr::case_when(stringr::str_sub(mouse.id.3, 1, 1) %in% c("m","M") ~ "m",
stringr::str_sub(mouse.id.3, 1, 1) %in% c("f","F") ~ "f",
TRUE ~ "unknown"),
bg = if_else(condition = (predicted.sex.3 == "m" | predicted.sex.3 == "f"),
true = str_replace(string = bg,
pattern = mouse.id.3,
replacement = ""),
false = bg),
mouse.id.4 = stringr::str_sub(sample, -4),
predicted.sex.4 = dplyr::case_when(stringr::str_sub(mouse.id.4, 1, 1) %in% c("m","M") ~ "m",
stringr::str_sub(mouse.id.4, 1, 1) %in% c("f","F") ~ "f",
TRUE ~ "unknown"),
bg = if_else(condition = (predicted.sex.4 == "m" | predicted.sex.4 == "f"),
true = str_replace(string = bg,
pattern = mouse.id.4,
replacement = ""),
false = bg),
mouse.id.5 = stringr::str_sub(sample, -5),
mouse.id.5 = stringr::str_replace(string = mouse.id.5, ## a couple symbols in these ids mess up the regex search
pattern = "[:symbol:]",
replacement = ""),
predicted.sex.5 = dplyr::case_when(stringr::str_sub(mouse.id.5, 1, 1) %in% c("m","M") ~ "m",
stringr::str_sub(mouse.id.5, 1, 1) %in% c("f","F") ~ "f",
TRUE ~ "unknown"),
bg = if_else(condition = (predicted.sex.5 == "m" | predicted.sex.5 == "f"),
true = str_replace(string = bg,
pattern = mouse.id.5,
replacement = ""),
false = bg),
mouse.id.6 = stringr::str_sub(sample, -6),
mouse.id.6 = stringr::str_replace(string = mouse.id.6, ## a couple symbols in these ids mess up the regex search
pattern = "[:punct:]",
replacement = ""),
mouse.id.6 = stringr::str_replace(string = mouse.id.6, ## a couple symbols in these ids mess up the regex search
pattern = "[:symbol:]",
replacement = ""),
predicted.sex.6 = dplyr::case_when(stringr::str_sub(mouse.id.6, 1, 1) %in% c("m","M") ~ "m",
stringr::str_sub(mouse.id.6, 1, 1) %in% c("f","F") ~ "f",
TRUE ~ "unknown"),
bg = if_else(condition = (predicted.sex.6 == "m" | predicted.sex.6 == "f"),
true = str_replace(string = bg,
pattern = mouse.id.6,
replacement = ""),
false = bg)) %>%
dplyr::mutate(predicted.sex = dplyr::case_when(predicted.sex.1 == "m" ~ "m",
predicted.sex.3 == "m" ~ "m",
predicted.sex.4 == "m" ~ "m",
predicted.sex.5 == "m" ~ "m",
predicted.sex.6 == "m" ~ "m",
predicted.sex.1 == "f" ~ "f",
predicted.sex.3 == "f" ~ "f",
predicted.sex.4 == "f" ~ "f",
predicted.sex.5 == "f" ~ "f",
predicted.sex.6 == "f" ~ "f",
TRUE ~ "unknown"))
predicted.sexes.strings <- prelim.predicted.sexes %>%
dplyr::filter(predicted.sex != "unknown") %>%
dplyr::bind_rows(.,digit.trim)
## Taking the first marker as a sample
predicted.sex.table <- predicted.sexes.strings %>%
dplyr::filter(marker %in% unique(prelim.predicted.sexes$marker)[1]) %>%
dplyr::select(sample, predicted.sex, bg) %>%
dplyr::group_by(predicted.sex) %>%
dplyr::count()
This captured 94 female samples, 249 male samples, leaving 21 samples of unknown predicted sex from nomenclature alone. These samples exhibit a range of naming inconsistencies.
predicted.sexes.strings %>%
dplyr::filter(predicted.sex == "unknown",
marker == predicted.sexes.strings$marker[[1]]) %>%
dplyr::select(sample, bg)
## sample bg
## 1 34-HG-F1 F1
## 2 36-PG-F1 F1
## 3 BAG74u BAG74u
## 4 BAG99 BAG99
## 5 CAST/EiJ CAST/EiJ
## 6 IN13 IN13
## 7 IN25 IN25
## 8 IN34 IN34
## 9 IN40 IN40
## 10 IN47 IN47
## 11 IN54 IN54
## 12 KOMP cell DNA JM8 KOMP cell DNA JM8
## 13 KOMP cell DNA JM8 KOMP cell DNA JM8
## 14 KOMP cell DNA JM8 KOMP cell DNA JM8
## 15 MWN1026 MWN1026
## 16 MWN1030 MWN1030
## 17 MWN1194 MWN1194
## 18 MWN1198 MWN1198
## 19 MWN1214u MWN1214u
## 20 MWN1279 MWN1279
## 21 MWN1287 MWN1287